import requests
from bs4 import BeautifulSoup
import re
import json
import xlwt
import jieba
import grade
import time
import datetime

def get_data(req_url):
  """Fetch *req_url* and return the page HTML as text.

  Returns None when the request fails — either a network-level error
  (timeout, DNS, connection reset) or a non-200 status code.
  """
  headers = {
    # Desktop IE11 UA; the site rejects requests without a browser UA.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept-Language': 'zh-Hans-CN, zh-Hans;q=0.5'
  }
  try:
    # Timeout so a dead server cannot hang the whole crawl.
    resp = requests.get(req_url, headers=headers, timeout=10)
  except requests.RequestException:
    # Bug fix: network errors used to propagate and kill the crawler;
    # treat them the same as a bad status code.
    return None
  resp.encoding = 'gbk'  # site serves GBK; without this Chinese text is garbled
  if resp.status_code == 200:
    return resp.text
  return None


def get_job_info(name, urls):
    """Scrape each job-detail page and collect company / job descriptions.

    Parameters:
      name: list of company names (used only for progress logging).
      urls: list of job-detail URLs, parallel to *name*.

    Returns (company_infos, jobs_infos), two lists of strings.
    NOTE(review): entries are appended only when the expected <div>
    blocks are found on a jobs.51job.com page, so the returned lists
    may be shorter than *urls* — original behaviour, preserved.
    """
    jobs_infos = []
    company_infos = []
    for company, url in zip(name, urls):
        print("开始采集 "+company+" 信息")
        # Un-escape JSON-style slashes embedded in the scraped URL.
        url = str(url).replace("\/","/")
        response = get_data(url)
        # Only jobs.51job.com detail pages have the layout parsed below.
        if response is None or url.find("jobs.51job.com") == -1:
          print("采集完成 "+company+" 信息失败")
          jobs_infos.append("")
          company_infos.append("")
        else:
          soup = BeautifulSoup(response, 'lxml')
          # Company introduction block.
          for d in soup.find_all("div", class_="tmsg inbox"):
              company_infos.append(d.get_text().strip())
          # Job description block: concatenate all paragraph texts.
          for d in soup.find_all("div", class_="bmsg job_msg inbox"):
              job_info = "".join(p.get_text().strip() for p in d.find_all("p"))
              jobs_infos.append(job_info.replace("岗位职责",""))
          print("采集完成 "+company+" 信息")

    return company_infos,jobs_infos


def parse_data(response):
  """Parse the embedded JSON on a 51job listing page into parallel lists.

  Each field lives in the page source as a '"key":"value"' pair, so it
  is extracted with a targeted regex. Returns a 13-tuple of lists:
  (company_name, job_name, salary, work_area, company_type, issuedate,
  job_fuli, companysize, company_hangye, company_info, job_info,
  job_key, job_grade).
  """
  def pick(key):
    # Build the same pattern for every field: "key":"(captured value)".
    return re.findall('"' + key + '":"(.*?)"', response)

  company_name = pick('company_name')        # company name
  job_name = pick('job_title')               # job title
  salary = pick('providesalary_text')        # monthly salary range
  work_area = pick('workarea_text')          # work location
  company_type = pick('companytype_text')    # company type
  issuedate = pick('issuedate')[:-1]         # posting dates; last match dropped (as before)
  job_fuli = pick('jobwelf')                 # benefits / welfare
  companysize = pick('companysize_text')     # company size
  company_hangye = pick('companyind_text')   # company industry
  company_url = pick('job_href')             # job-detail page URL
  print("公司列表信息解析完成.....")
  company_info, job_info = get_job_info(company_name, company_url)
  print("开始对当前公司列表职能匹配程度进行智能打分....")
  job_key, job_grade = grade.getKeysList(job_info)
  print("智能打分完成....")
  return (company_name, job_name, salary, work_area, company_type,
          issuedate, job_fuli, companysize, company_hangye,
          company_info, job_info, job_key, job_grade)



def save_data(res_list,filename):
  """Write the scraped data to *filename*.xls, one column per field.

  Parameters:
    res_list: dict mapping column title -> list of cell values.
    filename: output file name without the .xls extension.
  """
  workbook = xlwt.Workbook(encoding = 'utf-8')
  worksheet = workbook.add_sheet('RosCompany')
  for col, title in enumerate(res_list):
    worksheet.write(0, col, label=title)
    for row, value in enumerate(res_list[title]):
      # Un-escape JSON-style slashes before writing the cell.
      worksheet.write(row + 1, col, label=str(value).replace("\/","/"))
  workbook.save(filename+'.xls')
  # Bug fix: the original printed an empty destination ("...到"+"").
  print("数据已保存到"+filename+'.xls')


def getCompany(file):
  """Read listing-page URLs from *file* and scrape all companies.

  Returns a dict mapping Chinese column titles to parallel lists of
  values, ready to be passed to save_data().
  """
  res_lists = {
    '公司名称': [],
    '工作名称': [],
    '薪水待遇月薪': [],
    '公司地点': [],
    '公司类型': [],
    '信息发布时间': [],
    '福利': [],
    '公司规模': [],
    '公司行业': [],
    '公司介绍': [],
    '相关工作介绍': [],
    '相关工作关键词': [],
    '技术匹配度': [],
  }
  print("开始读取网页列表信息....")
  with open(file, "r") as f:
    # Bug fix: strip the trailing newline each line carries (it used to
    # be passed straight into the HTTP request), and skip blank lines.
    urls = [line.strip() for line in f if line.strip()]

  print("网页列表信息读取完成....")
  for url in urls:
    print("开始请求"+url+"公司列表信息....")
    resp_html = get_data(url)
    if resp_html is None:
      # Robustness: a failed download used to crash parse_data
      # (re.findall on None); skip the page instead.
      continue
    print("公司列表请求完成开始解析公司信息数据.....")
    res_list = parse_data(resp_html)
    # parse_data returns a 13-tuple in the same order as res_lists' keys
    # (dicts preserve insertion order).
    for key, values in zip(res_lists, res_list):
      res_lists[key] += values
  return res_lists


if __name__ == '__main__':
  print("开始爬取公司信息")
  start_time = time.time()
  data = getCompany("urls.txt")
  save_data(data, "深圳+前端+在校应届生_50人以上公司_V1.0_20210308")
  # Human-readable elapsed time (whole seconds).
  elapse = datetime.timedelta(seconds=int(time.time() - start_time))
  # Typo fix: "工爬取" -> "共爬取" ("crawled N records in total").
  print("完成爬取,共爬取"+str(len(data['公司名称']))+"条信息,耗时"+str(elapse))